#!/cray/css/users/bavier/bin/guile -s
!#

;;
;; Procedures for manipulating data stored in a
;; 2D-like structure, where a row represents a
;; "record", and records may have a number of
;; "attributes" or columns
;;
(define-module (data-mining dataset)
  #:use-module (data-mining type-conversions)
  #:use-module (data-mining util)
  #:use-module (data-mining hash-util)
  #:use-module (data-mining attributes)
  #:use-module (data-mining indexed-matrix)
  #:use-module (srfi srfi-1) ;;autoload (srfi srfi-1) (first second third last every)
  #:use-module (srfi srfi-9)
  #:use-module (srfi srfi-19) ;;(date->string time-utc->date)
  #:use-module (srfi srfi-26) ;;(cut cute)
  #:use-module (ice-9 receive) ;;(receive)
  #:use-module (ice-9 rdelim) ;;(read-line write-line)
  #:use-module (ice-9 regex) ;;(string-match make-regexp regexp-exec)
  #:export (make-dataset
            make-dataset/shared

            dataset-length
            dataset-width
            dataset-empty?

            ;; arff->dataset
            delimited->dataset
            dataset->delimited

            dataset-set!
            dataset-ref
            dataset-filter
            dataset-partition-records

            dataset-attributes
            dataset-attribute
            dataset-attribute-ref
            dataset-attribute-set!

            dataset-attribute-indices
            dataset-attribute-values
            dataset-label-idx
            dataset-label-attribute-values
            dataset-record-indices

            dataset-derive-attribute!
            dataset-entry-value-alist))

(define author    "Eric Bavier <bavier@member.fsf.org>")
(define date      "2014 June 3")
(define copyright "GPLv3+")
;;
;; This is free software released under the GPLv3, or later
;;

;; =================================================
;;                  Dataset Object
;; =================================================

(define-record-type dataset
  (make-dataset*
   entries                              ;2D matrix containing dataset
                                        ;values

   attribute-table                      ;A collection of the attributes
                                        ;describing each column of the
                                        ;dataset

   label-idx                            ;The name of the label attribute
   )
  dataset?
  (entries         dataset-entries)
  (attribute-table dataset-attribute-table)
  (label-idx       dataset-label-idx  set-dataset-label-idx!))

(define (attribute-list->attributes lst)
  (fold
   ;; Allow a mixing of simple symbol declarations
   ;; for attributes and concrete attribute types.
   (lambda (e hash)
     (cond
      ((symbol? e)
       (let ((attr (symbol->attribute e)))
         (hash-set!
          hash (attribute-name attr) attr)))
      ((attribute? e)
       (hash-set! hash (attribute-name e) e)))
     hash)
   (make-hash-table)
   lst))
(define* (make-dataset attributes
                       label-idx
                       #:optional
                       (entries (make-indexed-matrix)))
  (let ((attributes* (cond
                      ((hash-table? attributes) attributes)
                      (else (attribute-list->attributes attributes)))))
    (make-dataset* entries attributes* label-idx)))

(define* (make-dataset/shared dataset
                              #:key
                              (rows (dataset-record-indices dataset))
                              (columns (dataset-attribute-indices dataset)))
  ;; Make sure the dataset label attribute is included in the columns
  (let* ((columns* (lset-adjoin equal?
                                columns
                                (dataset-label-idx dataset)))
         (attribute-table* (hash-subset (dataset-attribute-table dataset)
                                        columns*)))
    (make-dataset
     attribute-table*
     (dataset-label-idx dataset)
     (make-indexed-matrix/shared (dataset-entries dataset)
                                 #:row-indices rows
                                 #:column-indices columns*))))

;;; Set the value of an entry in DS.  We assume that ENTRY has already
;;; been vetted by the attribute it is being set for (i.e. no scrubbing
;;; takes place).
(define (dataset-set! ds entry rec-idx attr-idx)
  (indexed-matrix-set!
   (dataset-entries ds)
   entry rec-idx attr-idx))

(define (dataset-ref ds rec-idx attr-idx)
  (indexed-matrix-ref (dataset-entries ds) rec-idx attr-idx))

;;; Filter the entries of DS simultaneously on the contents of each record
;;; (row) as well as the contents of each attribute (column).
;;;
;;; The procedure RECORD-PRED is applied as (record-pred row-idx entry-alist)
;;; where row-idx is the unique identifier for the current row, and
;;; entry-alist is an association list where the keys are the attribute
;;; names and the values are the values each attribute assumes for the current
;;; record.  Analogously for ATTRIBUTE-PRED.  RECORD-PRED and ATTRIBUTE-PRED
;;; may be #t, in which case all rows or columns, respectively, are returned
;;; in the resulting dataset.
;;;
;;; Note: It is not possible to filter out the label attribute
;;;
(define* (dataset-filter ds
                         #:key
                         (record-pred #t)
                         (attribute-pred #t))
  (let ((recs
         (if (eq? record-pred #t)
             (dataset-record-indices ds)
             (filter identity           ;filter out those that return #f
                     (indexed-matrix-map-indexed-rows
                      (lambda (i e) (and (record-pred i e) i))
                      (dataset-entries ds)))))
        (attrs
         (if (eq? attribute-pred #t)
             (dataset-attribute-indices ds)
             (filter identity           ;filter out those that return #f
                     (indexed-matrix-map-indexed-columns
                      (lambda (j e) (and (attribute-pred j e) j))
                      (dataset-entries ds))))))
    (make-dataset/shared ds #:rows recs #:columns attrs)))

;;; Returns a list of datasets of length (1+ (length
;;; preds)).  The records in d1 are those records
;;; which satisfy p1, the records in d2 are those
;;; which satisfay p2 but not p1, etc.  The last
;;; dataset in the result contains those records
;;; which do not satisfy any of the predicates in
;;; PREDS.
;;;
;;; Each predicate in PREDS is applied as in
;;; dataset-filter
(define (dataset-partition-records preds ds)
  (if (null? preds) (list ds)
      (reverse!
       (map
        (lambda (alist-ds)
          (make-dataset/shared ds #:rows (map car alist-ds)))
        (fold (lambda (p l)
                (let* ((part (fold (lambda (e acc)
                                     (if (p (car e) (cdr e))
                                         (cons (cons e (car acc))
                                               (cdr acc))
                                         (cons (car acc)
                                               (cons e (cdr acc)))))
                                   '(() . ())
                                   (car l)))
                       (in (reverse! (car part)))
                       (out (reverse! (cdr part))))
                  (cons out (cons in (cdr l)))))
              (list (indexed-matrix-map-indexed-rows
                     cons (dataset-entries ds)))
              preds)))))

;; Lookup an attribute ATTR in DS by name or index.  Return #f if none found
;; by the given index.
(define (dataset-attribute-ref ds attr-idx)
  (hash-ref (dataset-attribute-table ds) attr-idx))

(define (dataset-attribute-set! ds attr-idx attr)
  (hash-set! (dataset-attribute-table ds) attr-idx attr))

(define dataset-attribute
  (make-procedure-with-setter dataset-attribute-ref
                              dataset-attribute-set!))

;;; Return a reference to the label attribute
;; (define (dataset-label-attribute-ref ds)
;;   (dataset-attribute-ref ds (dataset-label-idx ds)))

;; (define (dataset-label-attribute-set! ds attr)
;;   (let ((label-idx (dataset-label-idx ds))
;;      (label-idx* (attribute-name attr)))
;;     (hash-remove! (dataset-attribute-table ds) label-idx)
;;     (hash-set!    (dataset-attribute-table ds) lavel-idx* attr)
;;     ;; Swap row references to labels from the old tag to the new
;;     (indexed-matrix-reindex-column! (entries ds) oldtag newtag)
;;     (set-dataset-label-idx! ds label-idx*)))

;; (define dataset-label-attribute
;;   (make-procedure-with-setter dataset-label-attribute-ref
;;                            dataset-label-attribute-set!))

;;; Return a list of the concrete attribute validators of DS.
(define (dataset-attributes ds)
  (hash-map->list (lambda (_ b) b) (dataset-attribute-table ds))
  (map (cute hash-ref (dataset-attribute-table ds) <>)
       (dataset-attribute-indices ds)))

(define* (dataset-attribute-indices ds
				    #:key
				    (with-label #t))
  (let ((indices (hash-map->list (lambda (i _) i)
				 (dataset-attribute-table ds))))
    (if with-label
	indices
	(remove (cute equal? <> (dataset-label-idx ds))
		indices))))

;;; Return a list of values belonging to the named attribute
(define (dataset-attribute-values ds attr-idx)
  (indexed-matrix-column-entries (dataset-entries ds) attr-idx))

;;; Return a list of the label values
(define (dataset-label-attribute-values ds)
  (dataset-attribute-values ds (dataset-label-idx ds)))

;;; Return a list of the record tag/index values
(define (dataset-record-indices ds)
  (indexed-matrix-row-tags (dataset-entries ds)))

;;; Return the number of records this dataset has.
(define (dataset-length ds)
  (indexed-matrix-length (dataset-entries ds)))

;;; Return the number of attributes this dataset has
;;; (does not including the label attribute)
(define (dataset-width ds)
  (1- (hash-table-size (dataset-attribute-table ds))))

(define (dataset-empty? ds)
  (or (= (dataset-length ds) 0)
      (= (dataset-width ds) 0)))

;; Add a new attribute ATTR to DS according to the
;; expression in PROC-EXP, which may contain
;; attribute tag names from DS.  PROC-EXP will be
;; applied record-wise over DS, and the value for
;; ATTR at that record will be the result of
;; evaluating PROC-EXP with tag names replaced by
;; the attribute value of that record.'
;;
;; E.g. ::
;;
;; (dataset-derive-attribute!
;;  d
;;  (make-numeric-attribute #:name 'foo)
;;  '(/ bar baz))
;;
(define (dataset-derive-attribute! ds attr proc-exp)
  (let ((name (attribute-name attr))
        (ents (dataset-entries ds)))
    (begin
      (dataset-attribute-set! ds name attr)
      (for-each
       (lambda (i)
         (indexed-matrix-set!
          ents
          (eval (substitute-map proc-exp (dataset-entry-value-alist ds i))
                (interaction-environment))
          i name))
       (indexed-matrix-row-tags ents))
      (set-attribute-domain! attr (dataset-attribute-values ds name)))))

(define (dataset-entry-value-alist ds rec-idx)
  (indexed-matrix-indexed-row (dataset-entries ds) rec-idx))

;; =================================================
;;       Reading a dataset from an ARFF file
;; =================================================

;; Produces a new attribute according to the ARFF
;; @attribute tag-line
(define (arff->attribute port)
  (define (attr-from-def name def)
    (cond ((equal? def "string")    (make-string-attribute #:name name))
          ((or (equal? def "numeric")
               (equal? def "real")) (make-numeric-attribute #:name name))
          ((equal? def "integer")   (make-integer-attribute #:name name))
          ((char=? #\{ (string-ref def 0))
           (let ((domain (string-split
                          (substring def 1 (1- (string-length def)))
                          #\,)))
             (make-nominal-attribute #:name name #:domain domain)))
          ((char=? #\[ (string-ref def 0))
           (let ((domain (string-split
                          (substring def 1 (1- (string-length def)))
                          #\,)))
             (make-ordinal-attribute #:name name #:domain domain)))))
  (define (bad-input in)
    (error (format #f "Cannot construct attribute from input: ~s\n" in)))
  (let* ((line (read-line port 'trim))
         (pieces (remove string-null? (string-split line #\ ))))
    (if (= (length pieces) 3)
        (let ((arff-tag (first pieces))
              (attr-name (string->symbol (second pieces)))
              (attr-def (third pieces)))
          (if (string-ci=? arff-tag "@attribute")
              (attr-from-def attr-name attr-def)
              (bad-input line))))))


;; Reads an ARFF-formatted data stream from PORT and
;; returns a new dataset with the contained data
;; (define* (arff->dataset #:optional (port (current-input-port)) . rest)
;;   (let ((ds (make <dataset>)))
;;     (begin
;;       ((cut set-from-arff! ds port <...>) rest)
;;       ds)))

;;; IGNORE-ATTRIBUTES and SELECT-ATTRIBUTES should be lists of symbols, which
;;; specify the attributes to ignore or select, respectively.
;;; SELECT-ATTRIBUTES may also be #t, in which case all attributes are
;;; selected.
;; (define* (arff->dataset #:optional
;;                      (port (current-input-port))
;;                      #:key
;;                      (ignore-attributes '())
;;                      (select-attributes #t)
;;                      (tag-index first)
;;                      (label-index last))
;;   (let ((ds (make <dataset>)))
;;     (begin
;;       (skip-comments-and-whitespace port)
;;       ;; Read relation tag and discard it
;;       (let ((line (read-line port)))
;;      (unless (string-match "@relation.*" line)
;;              (error (format #f "Expecting '@relation' tag but got ~s" line))))
;;       (skip-comments-and-whitespace port)
;;       ;; Now we expect a block of '@attribute' statements
;;       (let ((start-of-data? (let ((r (make-regexp "^@data.*$" regexp/icase)))
;;                            (cut regexp-exec r <>)))
;;          (attrs '()))
;;      (begin
;;        ;; First, read all the attribute definitions
;;        (do ((line (read-line port)
;;                   (begin (skip-comments-and-whitespace port)(read-line port))))
;;            ;; Stop once the '@data' tag has been read
;;            ((start-of-data? line))
;;          (let ((a (arff->attribute (open-input-string line))))
;;            (if (or (eq? select-attributes #t)
;;                    (member (tag a) select))
;;                (begin
;;                  (set! attrs (append! attrs (list a)))))))
;;        ;; Then, pick out those we're interested in
;;        (let* ((all-tags (map tag attrs))
;;               (select (cond
;;                        ((eq? select-attributes #t) all-tags)
;;                        ((eq? select-attributes #f) '())
;;                        (else select-attributes)))
;;               (ignore (cond
;;                        ((eq? ignore-attributes #f) '())
;;                        (else ignore-attributes)))
;;               (selected-tags
;;                (map (lambda (s)
;;                       (cond ((integer? s) (list-ref all-tags s))
;;                             (else
;;                              (let ((ss (as-symbol s)))
;;                                (if (member ss all-tags)
;;                                    ss
;;                                    (error
;;                                     (format #f
;;                                             "Selected tag ~s not an attribute"
;;                                             s)))))))
;;                     (lset-difference equal? select ignore)))
;;               (selected-indices
;;                (map (let ((tag-map (list->index-map all-tags)))
;;                       (cut assq-ref tag-map <>))
;;                     selected-tags))
;;               (ti (cond
;;                    ((procedure? tag-index) (tag-index selected-indices))
;;                    ((symbol? tag-index)
;;                     (list-index (cut eq? tag-index <>)
;;                                 all-tags))
;;                    (else tag-index)))
;;               (li (cond
;;                    ((procedure? label-index) (label-index selected-indices))
;;                    ((symbol? label-index)
;;                     (list-index (cut eq? label-index <>)
;;                                 all-tags))
;;                    (else label-index)))
;;               (label-attr (list-ref attrs li))
;;               (tag-label-list (if tag-index (list li ti) (list li)))
;;               (mask (lset-union! = selected-indices tag-label-list))
;;               (entry-attributes (take-indices
;;                                  attrs
;;                                  (lset-difference = mask (if tag-index
;;                                                              (list ti) '())))))
;;          (slot-set! (entries ds) 'col-tags
;;                     (map tag entry-attributes))
;;          (for-each (lambda (a) (dataset-attribute-set! ds (tag a) a))
;;                    entry-attributes)
;;          (set! (dataset-label-attribute ds) label-attr)
;;          ;; Now all the attributes are loaded in ATTRS,
;;          ;; create a dataset and set its values from
;;          ;; the block after '@data'
;;          (set-delimited! ds mask port #:delimiter #\,
;;                          #:tag-index ti #:label-index li))))
;;       ds)))

;;; Shorthand for reading lines from delimited input that may contain
;;; whitespace lines and comments
(define* (next-line port #:optional (handle-delim 'trim))
  (begin
    (skip-comments-and-whitespace port)
    (read-line port handle-delim)))

;;; Read a dataset from delimited text.
;;;
;;; ATTRIBUTES should be a list of attributes that describe the columns
;;; of the input data.  The length of ATTRIBUTES should be the same as
;;; the number of columns in the input data.  If any element of
;;; ATTRIBUTES is #f, then that column in the input will be ignored.
;;;
;;; If HEADER is #t, then assume there is a header line and read
;;; attribute indices from that.  Indices read in such a way will
;;; override any names/indices that the attributes in ATTRIBUTES already
;;; had.  If HEADER is #f then we assume that attributes already have
;;; names set.
(define* (delimited->dataset attributes
                             label-idx
                             #:optional
                             (port (current-input-port))
                             #:key
                             (delimiter #\,)
                             (header #t)
                             (rec-idx #f))
  (let* ((attribute-columns (list-indices attribute? attributes))
         (attributes* (filter attribute? attributes))
         (attribute-indices
          (if header
              ;; Read attribute indices from header line
              (let* ((line (next-line port))
                     (pieces (map (cute list-ref
                                        (string-split line delimiter)
                                        <>)
                                  attribute-columns))
                     (indices (map string->symbol pieces)))
                (map (lambda (attr idx)
                       (set-attribute-name! attr idx))
                     attributes*
                     indices)
                indices)
              (map attribute-name attributes)))
         ;; We need an input attribute for the record index column, but
         ;; that attribute should not be added to the dataset.
         (dataset-attrs (if rec-idx
                            (remove (lambda (a)
                                      (equal? (attribute-name a) rec-idx))
                                    attributes*)
                            attributes*))
         (attribute-map (map cons attribute-indices attribute-columns))
         (dataset (make-dataset dataset-attrs label-idx)))
    (set-delimited! dataset attribute-map port
                    #:delimiter delimiter
                    #:rec-idx rec-idx)
    dataset))

(define* (dataset->delimited dataset
                             #:optional
                             (port (current-output-port)))
  (let ((attr-indices (dataset-attribute-indices dataset)))
    (begin
      (format port "rec,~{~a~^,~}\n"
	      attr-indices)
      (for-each
       (lambda (rec-idx)
         (let ((values (dataset-entry-value-alist dataset rec-idx)))
           (format port "~a,~{~a~^,~}\n"
		   (symbol->string rec-idx)
		   (map (lambda (ai)
			  (assoc-ref values ai))
			attr-indices))))
       (dataset-record-indices dataset)))))

;;; ATTRIBUTE-MAP must be a list of pairs (attr-idx . attr-col) where
;;; ATTR-IDX is the index of an attribute in DATASET (if not part of
;;; DATASET it will be ignored) and ATTR-COL is the 0-based index at
;;; which values for that attribute reside in the delimited input.
;;; REC-IDX, if given, should name one of the attributes in
;;; ATTRIBUTE-MAP that is to be used to assign indices to records.  If
;;; not given each record will be assigned a "random" index.
(define* (set-delimited! dataset
                         ;; Maps from attribute index to the column in
                         ;; the delimited input where that attribute's
                         ;; values are found.  Should include the label
                         ;; attribute.
                         attribute-map  ;((attr-idx . attr-col) ...)
                         #:optional
                         (port (current-input-port))
                         #:key
                         (delimiter #\,)
                         (rec-idx #f))
  (let record-loop ((count 0))
    (let ((line (next-line port)))
      (if (not (eof-object? line))
          (let* ((str-values (map string-trim (string-split line delimiter)))
                 ;; Transform those string values into attribute values,
                 ;; and construct input suitable for
                 ;; set-dataset-entry-values!
                 (values (filter-map/key+value
                          (lambda (attr-idx attr-col)
                            (and=> (dataset-attribute dataset attr-idx)
                                   (lambda (attr)
                                     (cons attr-idx
                                           (attribute-make-value
                                            attr
                                            (list-ref str-values attr-col))))))
                          attribute-map))
                 (rec-name (string->symbol
                            (if rec-idx
                                (list-ref str-values
                                          (assoc-ref attribute-map rec-idx))
                                (string-append "rec" (number->string count))))))
            (set-dataset-entry-values! dataset rec-name values)
            (record-loop (1+ count)))))))

;;; For the record with index REC-IDX, set the values in VALUES, which
;;; must be an alist whose keys are attribute names/indices and whose
;;; values are the associated entry values for that attribute.  An entry
;;; with index REC-IDX may or may not already exist in DATASET.  VALUES
;;; must not necessarily contain a value for each attribute in DATASET,
;;; though if there are attribute indices in VALUES that are not part of
;;; DATASET then they will be ignored.
(define (set-dataset-entry-values! dataset rec-idx values)
  (let ((attributes (dataset-attribute-table dataset)))
   (for-each/key+value
    (lambda (attr-idx value)
      (if (hash-ref attributes attr-idx)
          (dataset-set! dataset value rec-idx attr-idx)))
    values)))

(define (skip-comments-and-whitespace port)
  (cond
   ((eof-object? (peek-char port)) (noop)) ;Nothing to be done
   ((char=? (peek-char port) #\%)
    (begin
      ;; Discard the comment line and continue
      (read-line port)
      (skip-comments-and-whitespace port)))
   ;; else check for whitespace-only lines
   (else
    (let ((line (read-line port 'concat)))
      (if (string-every char-set:whitespace line)
          ;; Discard this line and continue
          (skip-comments-and-whitespace port)
          ;; Else put the line back in port
          (unread-string line port))))))

;; Writing a dataset to an ARFF file
;; =================================

;; Write an attribute tag line for the given
;; attribute to PORT
(define (attribute->arff attr port)
  (display (string-append
            (format #f "@attribute ~a " (attribute-name attr))
            (let ((domain (attribute-domain attr)))
              (cond
               ;; Guess the arff attribute type from
               ;; the characteristics of the domain
               ;; values.
               ;;
               ;; TODO: So far this only works if
               ;; domain is a list.
               ((every string=? domain) "string")
               ((every symbol? domain)
                (format #f "{~{~a~^,~}}" domain))
               (else "string")))
            "\n")
           port))
;; (define-method (attribute->arff (attr <attribute>) port)
;;   (format port "@attribute ~a string\n" (tag attr)))
;; (define-method (attribute->arff (attr <string-attr>) port)
;;   (format port "@attribute ~a string\n" (tag attr)))
;; (define-method (attribute->arff (attr <numeric-attr>) port)
;;   (format port "@attribute ~a numeric\n" (tag attr)))
;; (define-method (attribute->arff (attr <nominal-attr>) port)
;;   (format port "@attribute ~a {~a}\n" (tag attr)
;;        (string-join (map as-string
;;                          (or (domain attr) '()))
;;                     ",")))
;; (define-method (attribute->arff (attr <ordinal-attr>) port)
;;   (format port "@attribute ~a [~a]\n" (tag attr)
;;        (string-join (map as-string
;;                          (or (domain attr) '()))
;;                     ",")))

;; (define* (dataset->arff ds #:optional (port (current-output-port)))
;;   (begin
;;     (when (slot-ref ds 'set-attr-domains)
;;        (dataset-set-domains! ds))
;;     (format port "% Dataset output by dataset.scm ~a\n"
;;          (date->string (time-utc->date (current-time))))
;;     (format port "@relation ~a\n\n"
;;          (let ((fn (and port (port-filename port))))
;;            (if fn
;;                ;; Pull out just the basename without extension
;;                (match:substring
;;                 (string-match "([^ /]+).arff" fn)
;;                 1)
;;                "foo")))
;;     (format port "@attribute tag string\n")
;;     (map (cut attribute->arff <> port)
;;       (dataset-attributes ds #:with-label #t))
;;     (format port "\n@data\n")
;;     (indexed-matrix-for-each-row
;;      (lambda (i elst)
;;        (format port "~a,~{~a~^,~}\n"
;;             (as-string i) (map as-string elst)))
;;      (entries ds))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Tests

(use-modules (srfi srfi-64)
             (srfi srfi-1)
             (ice-9 format)
             (srfi srfi-43)             ;vector library
             (data-mining test-util))

(test-begin "dataset-test")

;;; Check helper routine attribute-list->attributes
(define attr-hash (attribute-list->attributes
                   `(integer
                     nominal
                     ,(make-string-attribute))))
(hash-for-each
 (lambda (key value)
   (test-assert (attribute? value)))
 attr-hash)

;;; Check creating a dataset with symbols for attributes
(define d0 (make-dataset `(integer string ,(make-nominal-attribute #:name 'class))
                         'class))
(test-assert (dataset? d0))
(test-eq "width with no entries" 2 (dataset-width d0))
(test-eq "length with no entries" 0 (dataset-length d0))
(test-eq "label-idx set" 'class (dataset-label-idx d0))

(define d1 (make-dataset '(nominal) 'class))
(test-assert (dataset-empty? d1))
(test-eq "width of empty dataset" 0 (dataset-width d1))

(define d2 (make-dataset `(,(make-integer-attribute #:name "i")
                           ,(make-string-attribute #:name "s")
                           ,(make-real-attribute #:name "r")
                           ,(make-nominal-attribute #:name "n"))
                         "n"))
(define d2/s0 (make-dataset/shared d2))
(test-assert "shared has all attributes"
             ((list-permutation? '("i" "s" "r" "n"))
              (dataset-attribute-indices d2/s0)))
(test-eq "width of shared same" 3 (dataset-width d2/s0))
(define d2/s1 (make-dataset/shared d2 #:columns '("i" "r" "n")))
(test-assert "shared has subset of attributes"
             ((list-permutation? '("i" "r" "n"))
              (dataset-attribute-indices d2/s1)))
(test-eq "width of narrower shared" 2 (dataset-width d2/s1))
;;; Add some entries and make sure it doesn't affect the original
(dataset-set! d2/s1 2 "r0" "i")
(dataset-set! d2/s1 2.71 "r0" "r")
(dataset-set! d2/s1 'foo "r0" "n")
(test-eq "ref" 2    (dataset-ref d2/s1 "r0" "i"))
(test-eq "ref" 2.71 (dataset-ref d2/s1 "r0" "r"))
(test-eq "ref" 'foo (dataset-ref d2/s1 "r0" "n"))
(test-eq "length of single record dataset" 1 (dataset-length d2/s1))
(test-eq "length of original dataset" 0 (dataset-length d2))
(test-eq "length of record indices" 1 (length (dataset-record-indices d2/s1)))
(test-assert "row indices"
             ((list-permutation? '("r0")) (dataset-record-indices d2/s1)))
(test-assert "label values"
             ((list-permutation? '(foo)) (dataset-label-attribute-values d2/s1)))
(test-assert "record values"
             ((list-permutation? '(("i" . 2) ("r" . 2.71) ("n" . foo)))
              (dataset-entry-value-alist d2/s1 "r0")))
;;; Create a new attribute and derive some values for it
(dataset-derive-attribute!
 d2/s1
 (make-numeric-attribute #:name "s*")
 '(/ "r" "i"))
(test-eq "length with derived" 1 (dataset-length d2/s1))
(test-eq "width with derived" 3 (dataset-width d2/s1))
(test-eq "width of original after shared derive"
         3 (dataset-width d2))
(define derived-value (assoc-ref (dataset-entry-value-alist d2/s1 "r0") "s*"))
(test-eqv "derived value" 1.355 derived-value)
(test-assert "original does not know about derived attribute"
             ((list-permutation? '("i" "r" "s" "n"))
              (dataset-attribute-indices d2)))
;;; Check set-dataset-entry-values!
(set-dataset-entry-values! d2 "r1"
                           '(("i" . 3) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
(test-eq "shared datasets unchanged"
         0 (dataset-length d2/s0))
(test-eq "shared datasets unchanged"
         1 (dataset-length d2/s1))
(test-eq "set values for new record changes length"
         1 (dataset-length d2))
(test-assert "new record indices"
             ((list-permutation? '("r1"))
              (dataset-record-indices d2)))
;;; Update the entry for the "i" attribute
(set-dataset-entry-values! d2 "r1" '(("i" . 4)))
(test-eq "updating values does not change dataset length"
         1 (dataset-length d2))
(test-assert "updated record values"
             ((list-permutation?
               '(("i" . 4) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
              (dataset-entry-value-alist d2 "r1")))
;;; Values for "unknown" attribute indices should be ignored
(set-dataset-entry-values! d2 "r1" '(("i" . 5) ("blaz" . flib)))
(test-eq "no change for unknown attribute index"
         1 (dataset-length d2))
(test-eq "no change for unknown attribute index"
         3 (dataset-width d2))
(test-assert "updated record values"
             ((list-permutation?
               '(("i" . 5) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
              (dataset-entry-value-alist d2 "r1")))

;;; Check set-delimited!
(let* ((data "
a,classy,1,4,blurp,qu,0.2
b,classx,2,6,blub,qa,0.3")
       (foo (make-ordinal-attribute
             #:name 'foo
             #:read-value identity
             #:dissector-gen (cut ordinal-dissector
                                  <> <>
                                  string<? string=?)))
       (bar (make-string-attribute #:name 'bar))
       (baz (make-integer-attribute #:name 'baz))
       (bin (make-integer-attribute #:name 'bin))
       (bit (make-nominal-attribute
             #:name 'bit
             #:read-value string->symbol))
       (bug (make-string-attribute #:name 'bug))
       (baf (make-numeric-attribute #:name 'baf))
       (attribute-map `((foo . 0) (bar . 1) (baz . 2)
                        (bin . 3) (bit . 4) (bug . 5) (baf . 6)))
       (datasets
        (vector (make-dataset (list foo bar baz bin bit bug baf) 'bar)
                (make-dataset (list foo bar bin) 'bar)
                (make-dataset (list foo bar bit baf) 'bit)
                (make-dataset (list bar baz bin bug) 'bug))))
  (vector-for-each
   (lambda (i d args)
     (apply set-delimited! d attribute-map
            (open-input-string data) args))
   datasets
   '#(() () () (#:rec-idx foo)))
  ;; Check that the rec-idx-col option worked correctly
  (test-assert "rec-idx-col record indices"
               ((list-permutation? '(a b))
                (dataset-record-indices (vector-ref datasets 3))))
  (vector-for-each
   (lambda (i d)
     (test-eq (format #f "length of dataset ~a" i)
              2 (dataset-length d)))
   datasets)
  (vector-for-each
   (lambda (i d w)
     (test-eq (format #f "delimited width for dataset ~a" i)
              w (dataset-width d)))
   datasets
   #(6 2 3 3))
  (vector-for-each
   (lambda (i d v)
     (for-each
      (lambda (c)
        (test-assert (format #f "column values of ~a for dataset ~a"
                             (car c) i)
                     ((list-permutation? (cdr c))
                      (dataset-attribute-values d (car c)))))
      v))
   datasets
   '#(((foo . ("a" "b")) (bin . (4 6)) (bit . (blurp blub)))
      ((bar . ("classy" "classx")) (bin . (4 6)))
      ((baf . (0.2 0.3))))))

;;; Check delimited->dataset
(let* ((data "
foo,bar,baz,bin,bit,bug,baf
a,classy,1,4,blurp,qu,0.2
b,classx,2,6,blub,qa,0.3")
       ;; Attribute names should be derived from the header
       (attributes `(,(make-ordinal-attribute
                       #:read-value identity
                       #:dissector-gen (cut ordinal-dissector
                                            <> <>
                                            string<? string=?))
                     ,(make-string-attribute)
                     ,(make-integer-attribute)
                     ,(make-integer-attribute)
                     ,(make-nominal-attribute
                       #:read-value string->symbol)
                     ,(make-string-attribute)
                     ,(make-numeric-attribute)))
       (datasets `#(,(delimited->dataset attributes
                                         'foo ;label-idx
                                         (open-input-string data))
                    ,(delimited->dataset attributes
                                         'bar ;label-idx
                                         (open-input-string data))
                    ,(delimited->dataset attributes
                                         'baf
                                         (open-input-string data)
                                         #:rec-idx 'foo)
                    ,(delimited->dataset (list-mask attributes '(0 1 3 5))
                                         'bar
                                         (open-input-string data)
                                         #:rec-idx 'foo)
                    ,(delimited->dataset (list-mask attributes '(0 1 3 5 6))
                                         'bar
                                         (open-input-string data)
                                         #:rec-idx 'foo))))
  (vector-for-each
   (lambda (i d w)
     (test-eq w (dataset-width d)))
   datasets
   #(6 6 5 2 3))
  (vector-for-each
   (lambda (i d)
     (test-eq 2 (dataset-length d)))
   datasets)
  (vector-for-each
   (lambda (i d c)
     (test-assert (format #f "dataset ~a column names" i)
                  ((list-permutation? c)
                   (dataset-attribute-indices d)))
     (test-assert (format #f "dataset ~a column attributes" i)
		  ((list-permutation? c)
		   (map attribute-name
			(dataset-attributes d)))))
   datasets
   '#((foo bar baz bin bit bug baf)
      (foo bar baz bin bit bug baf)
      (foo bar baz bin bit bug baf)
      (bug bin bar)
      (baf bug bin bar)))
  (vector-for-each
   (lambda (i d v)
     (for-each
      (lambda (c)
        (test-assert (format #f "column values of ~a for dataset ~a"
                             (car c) i)
                     ((list-permutation? (cdr c))
                      (dataset-attribute-values d (car c)))))
      v))
   datasets
   '#(((foo . ("a" "b")) (bin . (4 6)) (bit . (blurp blub)))
      ((bar . ("classy" "classx")) (bin . (4 6)))
      ((baf . (0.2 0.3)))
      ((bug . ("qu" "qa")) (bin . (4 6)) (bar . ("classy" "classx")))
      ((baf . (0.2 0.3)) (bug . ("qu" "qa")))))
  (vector-for-each
   (lambda (i d l)
     (test-assert (format #f "label values for dataset ~a" i)
                  ((list-permutation? l)
                   (dataset-label-attribute-values d))))
   datasets
   '#(("a" "b")
      ("classy" "classx")
      (0.2 0.3)
      ("classy" "classx")
      ("classy" "classx")))
  ;; Check dataset-filter
  (vector-for-each
   (lambda (i d filters+assertions)
     (for-each
      (lambda (f)    ;f is a pair ((row-pred . col-pred) (assertions...))
        (let* ((row-pred (caar f))
               (col-pred (cdar f))
               (assertions (cdr f))
               (ds/f (dataset-filter d
                                     #:record-pred row-pred
                                     #:attribute-pred col-pred)))
          (for-each
           (lambda (assertion)
             (test-assert (assertion ds/f)))
           assertions)))
      filters+assertions))
   datasets
   `#((((#t #|row-pred|# . #t #|col-pred|#)
        ,(lambda (d) ((list-permutation? '(foo bar baz bin bit bug baf))
                      (dataset-attribute-indices d)))
        ,(lambda (d) ((list-permutation? '("a" "b"))
                      (dataset-label-attribute-values d))))
       ((,(lambda (ri vals) (string=? (assoc-ref vals 'bar) "classy")) . #t)
        ,(lambda (d) ((list-permutation? '("a"))
                      (dataset-label-attribute-values d)))
        ,(lambda (d) (= 1 (dataset-length d))))
       ((#t . ,(lambda (ai vals) (memq ai '(foo bar bin bit))))
        ,(lambda (d) ((list-permutation? '(foo bar bin bit))
                      (dataset-attribute-indices d))))
       ((,(lambda (ri vals) (< (assoc-ref vals 'baf) 0.25)) . #t)
        ,(lambda (d) (= 1 (dataset-length d))))))))

;;; Check dataset-partition-records

(let* ((data "
rec,data,class
0,0.1,a
1,0.12,a
2,0.09,a
3,0.21,b
4,0.18,b
5,0.11,a
6,0.121,a
7,0.23,b
8,0.04,c")
       (attributes `(,(make-nominal-attribute)
                     ,(make-numeric-attribute)))
       (dataset (delimited->dataset attributes
                                    'class
                                    (open-input-string data)
                                    #:rec-idx 'rec))
       (parts `(,(dataset-partition-records
                  `(,(lambda (ri vals)
                       (< (assoc-ref vals 'data) 0.06))
                    ,(lambda (ri vals)
                       (< (assoc-ref vals 'data) 0.15)))
                  dataset)
                ,(dataset-partition-records
                  `(,(lambda (ri vals)
                       (< (assoc-ref vals 'data) 0.12)))
                  dataset))))
  (for-each
   (lambda (part l)
     (test-eq "partition parts" l (length part)))
   parts
   '(3 2))
  (for-each
   (lambda (part data-sizes)
     (for-each
      (lambda (ds size)
        (test-eq "partition size"
                 size (dataset-length ds)))
      part data-sizes))
   parts
   '((1 5 3) (4 5))))

(test-end "dataset-test")

;; (define (test-dataset->arff)
;;   (begin
;;     (let ((d (make <dataset>))
;;        (d1 (make <dataset> #:attributes (list 'real 'integer 'ordinal 'nominal)))
;;        (counter 0))
;;       (begin
;;      (for-each
;;       (lambda (i)
;;         (for-each
;;          (lambda (j)
;;            (begin
;;              (dataset-set! d1 counter i j)
;;              (set! counter (1+ counter))))
;;          (col-tags (entries d1))))
;;       (list 'e0 'e1 'e2 'e3 'e4)))
;;       (dataset->arff d1))
;;     (newline)))

;; Try reading in a dataset from arff.  Output it to verify the contents.
;; (define (test-arff->dataset)
;;   (begin
;;    (let ((d (arff->dataset (open-input-string "
;; @relation bar
;; % This is a test dataset

;; @attribute tag string
;; @attribute funk string
;; @attribute foo numeric
;; @attribute bin numeric
;; @attribute bork [quick,quack]
;; @attribute frob {blurb,blip,blup}
;; @attribute label {a,b,c}

;; @data
;; e4,\"friz\",1.0,2.0,quick,blip,a
;; e3,\"fruz\",0.7,2.5,quick,blip,a
;; e2,\"frum\",1.2,2.0,quack,blurp,c
;; e1,\"fraz\",1.1,2.3,quack,blup,c
;; e0,\"frim\",1.6,2.9,quack,blup,b"))))
;;      (begin
;;        (dataset->arff d)
;;        (newline)))
;;    (let ((d (arff->dataset (open-input-string "
;; @relation bar

;; @attribute name string
;; @attribute class {a,b,c}
;; @attribute funk string
;; @attribute ignored [blarney,quack,silly]
;; @attribute val numeric

;; @data
;; e4,a,foo,friz,1.0
;; e3,b,bar,fratz,3.2
;; e2,b,biz,frumble,0.75
;; e1,c,bur,fram,0.01
;; e0,a,fit,frobble,10")
;;                         #:ignore-attributes (list 'ignored)
;;                         #:tag-index 'name
;;                         #:label-index 'class)))
;;      (begin
;;        (dataset->arff d)
;;        (newline)))))

;; (define (test-set-delimited!)
;;   (begin
;;     (let ((d (make <dataset> #:attributes (list 'real 'integer))))
;;       (begin
;;      (set-delimited! d (iota 4) (open-input-string "
;; e1,1.0,2,a
;; e3,2.0,3,b
;; e8,3.0,4,a
;; e4,4.0,1,r"))
;;      (dataset-set-domains! d)
;;      (dataset->arff d)
;;      (newline)))
;;     ;; Test another invocation, with a non-trivial column-mask, alternate
;;     ;; delimiter, and out-of-order tag and label indices.
;;     (let ((d (make <dataset> #:attributes (list 'real 'integer))))
;;       (begin
;;      (set-delimited! d (list 0 1 2 4) (open-input-string "
;; a:1.0:2:fuzz:e1
;; b:2.0:3:fizz:e3
;; a:3.0:4:fizz:e8
;; r:4.0:1:fuzz:e4")
;;                      #:tag-index 4 #:label-index 0
;;                      #:delimiter #\:)
;;      (dataset-set-domains! d)
;;      (dataset->arff d)
;;      (newline)))
;;     ;; This invocation will create record tags
;;     (let ((d (make <dataset> #:attributes (list 'nominal 'ordinal 'integer))))
;;       (begin
;;      (set-delimited! d (list 5 1 2 4) (open-input-string "
;; a,b,1,foo,10,classy
;; a,d,3,bar,20,classx
;; t,a,2,biz,5,classe")
;;                      #:tag-index #f #:label-index 5)
;;      (dataset-set-domains! d)
;;      (dataset->arff d)
;;      (newline)))))

;; (define (test-dataset-filter)
;;   (let ((d (make-test-dataset)))
;;     (dataset->arff (dataset-filter
;;                  d
;;                  #:record-pred (lambda (rt elts)
;;                                  (eq? (assoc-ref elts 'frob) 'blup))
;;                  #:attribute-pred #t))))

;; (define (make-test-dataset)
;;   (arff->dataset (open-input-string "
;; @relation bar
;; % This is a test dataset

;; @attribute tag string
;; @attribute funk string
;; @attribute foo numeric
;; @attribute bin numeric
;; @attribute bork [quick,quack]
;; @attribute frob {blurp,blip,blup}
;; @attribute label {a,b,c}

;; @data
;; e4,\"friz\",1.0,2.0,quick,blip,a
;; e3,\"fruz\",0.7,2.5,quick,blip,a
;; e2,\"frum\",1.2,2.0,quack,blurp,c
;; e1,\"fraz\",1.1,2.3,quack,blup,c
;; e0,\"frim\",1.6,2.9,quack,blup,b")))


;; Local Variables:
;; fill-column: 72
;; End:
